In [1]:
import pandas as pd, numpy as np, seaborn as sns, warnings, os, time
from datetime import datetime as dt
from matplotlib import pyplot as plt
import matplotlib.font_manager as fm
import plotly.express as px
import plotly.graph_objs as go
# Font presets (size 20 and size 24) available to the plotting cells.
font1 = fm.FontProperties(size=20)
font2 = fm.FontProperties(size=24)
# NOTE(review): this silences every warning for the whole session, including
# deprecation notices from pandas / seaborn / matplotlib. Consider narrowing it.
warnings.filterwarnings(action="ignore")
# The bundled seaborn style sheets were renamed to "seaborn-v0_8-*" by matplotlib,
# so the right name depends on the installed matplotlib, not on the seaborn version.
# Ask matplotlib which name it actually provides instead of parsing sns.__version__.
if 'seaborn-v0_8-darkgrid' in plt.style.available:
    plt.style.use('seaborn-v0_8-darkgrid')
else:
    plt.style.use('seaborn-darkgrid')
sns.set(font_scale=2)
In [2]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler,MinMaxScaler
from scipy.cluster.hierarchy import dendrogram, linkage
from scipy.cluster.hierarchy import fcluster
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
In [3]:
from matplotlib.colors import ListedColormap
from itertools import cycle
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import NuSVC, SVC, OneClassSVM
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import ConstantKernel, RBF, RationalQuadratic, ExpSineSquared, DotProduct, Matern, WhiteKernel
from sklearn.tree import DecisionTreeClassifier, export_graphviz, plot_tree
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB, ComplementNB
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import make_scorer, confusion_matrix, roc_curve, auc, accuracy_score, log_loss, hamming_loss, \
precision_score, recall_score, f1_score, fbeta_score, jaccard_score, \
precision_recall_curve, average_precision_score, balanced_accuracy_score, \
classification_report
from sklearn.metrics import roc_auc_score, zero_one_loss
from sklearn.metrics import multilabel_confusion_matrix
from sklearn.model_selection import KFold, StratifiedKFold, RepeatedKFold, \
RepeatedStratifiedKFold, LeavePOut, LeaveOneGroupOut, \
LeavePGroupsOut, ShuffleSplit, StratifiedShuffleSplit, TimeSeriesSplit, GridSearchCV
from sklearn.preprocessing import label_binarize
from sklearn.multiclass import OneVsRestClassifier,OneVsOneClassifier
from sklearn.neighbors import LocalOutlierFactor
from sklearn.covariance import EllipticEnvelope
from sklearn.ensemble import IsolationForest
from sklearn.svm import OneClassSVM
Outils du document¶
In [4]:
# Notebook-wide colour list (18 hex colours, three rows of six), previewed as a swatch strip.
palette = [
    "#030aa7", "#e50000", "#d8863b", "#005f6a", "#6b7c85", "#751973",
    "#0485d1", "#ff7855", "#fbeeac", "#0cb577", "#95a3a6", "#c071fe",
    "#d1e5f0", "#fddbc7", "#ffffcb", "#12e193", "#d8dcd6", "#dfc5fe",
]
sns.palplot(sns.color_palette(palette))
In [5]:
# Folder layout: <root>/<project>/ with one sub-folder for data and one for images.
repertoireRacine = '.'
nomProjet = 'Pinguins-Palmer Archipelago'
repertoireProjet = os.path.join(repertoireRacine, nomProjet)
repertoireDonnees, repertoireImages = (
    os.path.join(repertoireProjet, sous_dossier)
    for sous_dossier in ('repertoire.donnees', 'repertoire.images')
)
def controleExistenceRepertoire( repertoire, create_if_needed=True):
    """Check that ``repertoire`` is a directory, creating it when asked to.

    Parameters
    ----------
    repertoire : str or path-like
        Path of the directory to check.
    create_if_needed : bool, default True
        When True, a missing directory is created (including missing parents).

    Returns
    -------
    bool
        True when the directory exists on return (already present or just
        created); False when it is missing and ``create_if_needed`` is False.

    Raises
    ------
    NotADirectoryError
        When the path exists but is not a directory. This is an ``Exception``
        subclass, so callers catching ``Exception`` keep working.
    """
    if os.path.isdir(repertoire):
        return True
    if os.path.exists(repertoire):
        raise NotADirectoryError(
            f"Trouvé le nom {repertoire} mais c'est un fichier, pas un répertoire")
    if not create_if_needed:
        return False
    # exist_ok avoids a failure if the directory appears between the check and the creation.
    os.makedirs(repertoire, exist_ok=True)
    return True
def sauvegarderImage( fichier):
    """Save the current matplotlib figure as a timestamped PNG in ``repertoireImages``.

    Call it right before ``plt.show()``. The file is named
    ``<fichier>--<YYYY_MM_DD_HH.MM.SS>.png`` and written at 600 dpi with a
    tight bounding box. The image directory is created first if missing.
    """
    controleExistenceRepertoire(repertoireImages)
    horodatage = dt.now().strftime('%Y_%m_%d_%H.%M.%S')
    destination = os.path.join(repertoireImages, fichier + f"--{horodatage}.png")
    plt.savefig(destination, dpi=600, bbox_inches='tight')
def sauvegarderImageSNS( sns_plot, fichier):
    """Save the figure behind ``sns_plot`` as ``<fichier>.png`` in ``repertoireImages``.

    ``sns_plot`` must expose ``get_figure()``. Unlike ``sauvegarderImage`` no
    timestamp is appended and the figure's default save settings are used.
    The image directory is created first if missing.
    """
    controleExistenceRepertoire(repertoireImages)
    figure = sns_plot.get_figure()
    chemin = os.path.join(repertoireImages, fichier + '.png')
    figure.savefig(chemin)
# Make sure the project, data and image folders exist before anything is written.
for _repertoire in (repertoireProjet, repertoireDonnees, repertoireImages):
    controleExistenceRepertoire(_repertoire)
In [6]:
def formatPct(pct, allvals):
    """Build a pie-wedge label showing the percentage and the count it represents.

    Parameters
    ----------
    pct : float
        Percentage of the wedge (0-100).
    allvals : array-like of numbers
        All values of the pie; their sum is the total the percentage refers to.

    Returns
    -------
    str
        ``"<pct with 2 decimals>%"`` followed by a newline and the rounded
        absolute count in parentheses.
    """
    effectif = int(round(pct/100. * np.sum(allvals)))
    return f"{pct:.2f}%\n({effectif:d})"
In [7]:
def affichageDistribution(colonne,couleur,ax):
    """Draw the distribution of ``colonne`` on ``ax`` and mark its mean and median.

    Parameters
    ----------
    colonne : object exposing ``mean()`` and ``median()`` (e.g. a pandas Series)
        Values to plot.
    couleur : colour passed to seaborn for the distribution.
    ax : matplotlib axes receiving the plot.

    The mean is drawn as a solid green vertical line and the median as a dashed
    blue one; both appear in a legend placed in the upper right corner.
    """
    # NOTE(review): sns.distplot is deprecated in recent seaborn releases;
    # consider migrating to histplot/displot.
    graphe = sns.distplot(colonne, color=couleur, ax=ax)
    graphe.set(ylabel=None)
    moyenne = float(colonne.mean())
    mediane = float(colonne.median())
    reperes = (
        (moyenne, 'g', '-', f"mean = {moyenne:0.1f}"),
        (mediane, 'b', '--', f"median = {mediane:0.1f}"),
    )
    for position, teinte, trait, etiquette in reperes:
        ax.axvline(position, color=teinte, linestyle=trait, label=etiquette, lw=2)
    graphe.legend(loc="upper right")
In [8]:
def afficheDendrogram(*args, **kwargs):
    """Draw a SciPy dendrogram with merge-distance annotations and an optional cut line.

    Thin wrapper around ``scipy.cluster.hierarchy.dendrogram``: all positional
    and keyword arguments are forwarded to it, except the extra keywords below,
    which are consumed here.

    Extra keyword arguments
    -----------------------
    max_d : number, optional
        Cut height. When truthy, a black horizontal line is drawn at this height
        and it is used as ``color_threshold`` unless one was given explicitly.
    annotate_above : number, default 0
        Only merges whose height is strictly above this value are annotated.
    title, xlabel, ylabel : str, optional
        Axes title and labels; the defaults are the texts that were previously
        hard-coded.

    Returns
    -------
    dict
        The dictionary returned by ``dendrogram``.

    Notes
    -----
    Decorations are drawn on the ``ax`` passed through to ``dendrogram`` when
    there is one, otherwise on the current axes.
    NOTE(review): annotation positions use icoord/dcoord as (x, y), which
    presumably only matches a vertical orientation. Confirm before using with
    ``orientation='left'``/``'right'``.
    """
    max_d = kwargs.pop('max_d', None)
    if max_d and 'color_threshold' not in kwargs:
        kwargs['color_threshold'] = max_d
    annotate_above = kwargs.pop('annotate_above', 0)
    title = kwargs.pop('title', 'Classification Hiérarchique Ascendante')
    xlabel = kwargs.pop('xlabel', 'Villes ou (taille du cluster)')
    ylabel = kwargs.pop('ylabel', 'Distance')

    ddata = dendrogram(*args, **kwargs)
    if kwargs.get('no_plot', False):
        return ddata

    # Decorate the axes the dendrogram was drawn on, not whatever happens to be current.
    ax = kwargs.get('ax')
    if ax is None:
        ax = plt.gca()
    ax.set_title(title)
    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)
    for abscisses, ordonnees, couleur in zip(ddata['icoord'], ddata['dcoord'], ddata['color_list']):
        # Each link is a 4-point "U"; its top bar runs between points 1 and 2.
        x = 0.5 * sum(abscisses[1:3])
        y = ordonnees[1]
        if y > annotate_above:
            ax.plot(x, y, 'o', c=couleur)
            ax.annotate("%.3g" % y, (x, y), xytext=(0, -5),
                        textcoords='offset points',
                        va='top', ha='center')
    if max_d:
        ax.axhline(y=max_d, c='k')
    return ddata
Lecture des données¶
|
![]() |
|---|
In [9]:
# Load the penguin measurements from CSV.
# NOTE(review): the path is relative to the working directory and points outside the
# project folder; `repertoireDonnees` is not used here.
donnees = pd.read_csv('../donnees/Palmer Archipelago-Antarctica-Penguin/penguins_size.csv')
In [10]:
# Column roles: target, categorical predictors, numeric measurements.
cible = 'species'
qualitatives = ['island', 'sex']
quantitatives = [
    'culmen_length_mm',
    'culmen_depth_mm',
    'flipper_length_mm',
    'body_mass_g',
]
Effacement des valeurs non renseignées¶
In [11]:
# Show the rows whose body mass is missing.
donnees[donnees.body_mass_g.isna()]
Out[11]:
| species | island | culmen_length_mm | culmen_depth_mm | flipper_length_mm | body_mass_g | sex | |
|---|---|---|---|---|---|---|---|
| 3 | Adelie | Torgersen | NaN | NaN | NaN | NaN | NaN |
| 339 | Gentoo | Biscoe | NaN | NaN | NaN | NaN | NaN |
Je pense que je peux supprimer les lignes avec les entrées nulles sans causer de problèmes majeurs.
In [12]:
# Drop the rows without a body mass.
# .copy() makes the result an independent frame: later cells add and overwrite columns
# on `donnees`, which on a filtered slice triggers pandas' SettingWithCopyWarning
# (currently hidden by the global warnings filter).
donnees = donnees[donnees.body_mass_g.notna()].copy()
donnees.info()
<class 'pandas.core.frame.DataFrame'> Index: 342 entries, 0 to 343 Data columns (total 7 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 species 342 non-null object 1 island 342 non-null object 2 culmen_length_mm 342 non-null float64 3 culmen_depth_mm 342 non-null float64 4 flipper_length_mm 342 non-null float64 5 body_mass_g 342 non-null float64 6 sex 334 non-null object dtypes: float64(4), object(3) memory usage: 21.4+ KB
In [13]:
# Count fully duplicated rows.
donnees.duplicated().sum()
Out[13]:
0
Variables qualitatives¶
In [14]:
# Categorical predictor columns.
# NOTE(review): same value as the earlier `qualitatives` assignment, so this line is redundant.
qualitatives = ['island', 'sex']
Espèces de manchots¶
In [15]:
# Distinct species labels, sorted.
donnees.species.sort_values().unique()
Out[15]:
array(['Adelie', 'Chinstrap', 'Gentoo'], dtype=object)
In [16]:
# Two-way lookup between the sorted species labels and integer codes 0..n-1.
dictR_species = dict(enumerate(donnees.species.sort_values().unique()))
dict_species = {nom: code for code, nom in dictR_species.items()}
dict_species, dictR_species
Out[16]:
({'Adelie': 0, 'Chinstrap': 1, 'Gentoo': 2},
{0: 'Adelie', 1: 'Chinstrap', 2: 'Gentoo'})
In [17]:
# Keep the species label in `espece`, then replace `species` by its integer code.
# The guard makes the cell safe to re-run: without it a second run would overwrite the
# saved labels with codes and then fail with KeyError on the lookup.
if 'espece' not in donnees.columns:
    donnees['espece'] = donnees.species
# Always encode from the preserved labels; an unknown label still raises KeyError.
donnees['species'] = donnees['espece'].apply(lambda nom: dict_species[nom])
Sexe du manchot¶
In [18]:
# Distinct values found in the sex column, sorted.
donnees.sex.sort_values().unique()
Out[18]:
array(['.', 'FEMALE', 'MALE', nan], dtype=object)
In [19]:
# Rows whose sex is the '.' placeholder or is missing.
donnees[(donnees.sex == '.' )|(donnees.sex.isna() ) ]
Out[19]:
| species | island | culmen_length_mm | culmen_depth_mm | flipper_length_mm | body_mass_g | sex | espece | |
|---|---|---|---|---|---|---|---|---|
| 8 | 0 | Torgersen | 34.1 | 18.1 | 193.0 | 3475.0 | NaN | Adelie |
| 9 | 0 | Torgersen | 42.0 | 20.2 | 190.0 | 4250.0 | NaN | Adelie |
| 10 | 0 | Torgersen | 37.8 | 17.1 | 186.0 | 3300.0 | NaN | Adelie |
| 11 | 0 | Torgersen | 37.8 | 17.3 | 180.0 | 3700.0 | NaN | Adelie |
| 47 | 0 | Dream | 37.5 | 18.9 | 179.0 | 2975.0 | NaN | Adelie |
| 246 | 2 | Biscoe | 44.5 | 14.3 | 216.0 | 4100.0 | NaN | Gentoo |
| 286 | 2 | Biscoe | 46.2 | 14.4 | 214.0 | 4650.0 | NaN | Gentoo |
| 324 | 2 | Biscoe | 47.3 | 13.8 | 216.0 | 4725.0 | NaN | Gentoo |
| 336 | 2 | Biscoe | 44.5 | 15.7 | 217.0 | 4875.0 | . | Gentoo |
In [20]:
# Normalise sex: blank out the '.' placeholder, label every missing value 'none',
# and lower-case the remaining labels.
donnees.sex = (
    donnees.sex
    .mask(donnees.sex == '.')
    .fillna('none')
    .str.lower()
)
donnees.sex.sort_values().unique()
Out[20]:
array(['female', 'male', 'none'], dtype=object)
In [21]:
# Two-way lookup between the sorted sex labels and integer codes 0..n-1.
dictR_sex = dict(enumerate(donnees.sex.sort_values().unique()))
dict_sex = {nom: code for code, nom in dictR_sex.items()}
dict_sex, dictR_sex
Out[21]:
({'female': 0, 'male': 1, 'none': 2}, {0: 'female', 1: 'male', 2: 'none'})
In [22]:
# Keep the sex label in `sexe`, then replace `sex` by its integer code.
# The guard makes the cell safe to re-run: without it a second run would overwrite the
# saved labels with codes and then fail with KeyError on the lookup.
if 'sexe' not in donnees.columns:
    donnees['sexe'] = donnees.sex
# Always encode from the preserved labels; an unknown label still raises KeyError.
donnees['sex'] = donnees['sexe'].apply(lambda nom: dict_sex[nom])
Nom de l'île¶
In [23]:
# Distinct island names, sorted.
donnees.island.sort_values().unique()
Out[23]:
array(['Biscoe', 'Dream', 'Torgersen'], dtype=object)
In [24]:
# Two-way lookup between the sorted island names and integer codes 0..n-1.
dictR_island = dict(enumerate(donnees.island.sort_values().unique()))
dict_island = {nom: code for code, nom in dictR_island.items()}
dict_island, dictR_island
Out[24]:
({'Biscoe': 0, 'Dream': 1, 'Torgersen': 2},
{0: 'Biscoe', 1: 'Dream', 2: 'Torgersen'})
In [25]:
# Keep the island name in `nom_ile`, then replace `island` by its integer code.
# The guard makes the cell safe to re-run: without it a second run would overwrite the
# saved names with codes and then fail with KeyError on the lookup.
if 'nom_ile' not in donnees.columns:
    donnees['nom_ile'] = donnees.island
# Always encode from the preserved names; an unknown name still raises KeyError.
donnees['island'] = donnees['nom_ile'].apply(lambda nom: dict_island[nom])
In [26]:
# Text-valued copies of the categorical columns (original labels).
qualitativesT = [
    'espece',
    'sexe',
    'nom_ile',
]
Statistiques descriptives et analyse de données¶
Couleurs variables qualitatives¶
In [27]:
# One fixed colour per category, keyed by the sorted text label, followed by a swatch preview.
couleursEspece = dict(zip(donnees.espece.sort_values().unique(),
                          ["#751973", "#005f6a", "#d8863b"]))
couleursSexe = dict(zip(donnees.sexe.sort_values().unique(),
                        ["#0485d1", "#ff7855", "#95a3a6"]))
couleursNomIle = dict(zip(donnees.nom_ile.sort_values().unique(),
                          ["#d1e5f0", "#fddbc7", "#dfc5fe"]))
for _couleurs in (couleursEspece, couleursSexe, couleursNomIle):
    sns.palplot(sns.color_palette(_couleurs.values()))
Statistiques descriptives¶
In [28]:
# Peek at 5 random rows (no random_state is given, so the selection changes on every run).
donnees.sample(5)
Out[28]:
| species | island | culmen_length_mm | culmen_depth_mm | flipper_length_mm | body_mass_g | sex | espece | sexe | nom_ile | |
|---|---|---|---|---|---|---|---|---|---|---|
| 276 | 2 | 0 | 43.8 | 13.9 | 208.0 | 4300.0 | 0 | Gentoo | female | Biscoe |
| 290 | 2 | 0 | 47.7 | 15.0 | 216.0 | 4750.0 | 0 | Gentoo | female | Biscoe |
| 223 | 2 | 0 | 50.0 | 15.2 | 218.0 | 5700.0 | 1 | Gentoo | male | Biscoe |
| 68 | 0 | 2 | 35.9 | 16.6 | 190.0 | 3050.0 | 0 | Adelie | female | Torgersen |
| 5 | 0 | 2 | 39.3 | 20.6 | 190.0 | 3650.0 | 1 | Adelie | male | Torgersen |
In [29]:
# Summary statistics of the remaining columns once the text columns are dropped.
# NOTE(review): species, island and sex hold integer codes at this point, so their
# mean/std/quantile rows are not meaningful.
donnees.drop(columns=qualitativesT).describe().style.format("{:0.2f}") #.background_gradient(cmap=plt.get_cmap('Blues'),axis=0)
Out[29]:
| species | island | culmen_length_mm | culmen_depth_mm | flipper_length_mm | body_mass_g | sex | |
|---|---|---|---|---|---|---|---|
| count | 342.00 | 342.00 | 342.00 | 342.00 | 342.00 | 342.00 | 342.00 |
| mean | 0.92 | 0.66 | 43.92 | 17.15 | 200.92 | 4201.75 | 0.54 |
| std | 0.89 | 0.72 | 5.46 | 1.97 | 14.06 | 801.95 | 0.55 |
| min | 0.00 | 0.00 | 32.10 | 13.10 | 172.00 | 2700.00 | 0.00 |
| 25% | 0.00 | 0.00 | 39.23 | 15.60 | 190.00 | 3550.00 | 0.00 |
| 50% | 1.00 | 1.00 | 44.45 | 17.30 | 197.00 | 4050.00 | 1.00 |
| 75% | 2.00 | 1.00 | 48.50 | 18.70 | 213.00 | 4750.00 | 1.00 |
| max | 2.00 | 2.00 | 59.60 | 21.50 | 231.00 | 6300.00 | 2.00 |
In [30]:
# List the column names of the working frame.
donnees.columns
Out[30]:
Index(['species', 'island', 'culmen_length_mm', 'culmen_depth_mm',
'flipper_length_mm', 'body_mass_g', 'sex', 'espece', 'sexe', 'nom_ile'],
dtype='object')
Structure de l’échantillon des données ¶
In [31]:
# Peek at 5 random rows (no random_state is given, so the selection changes on every run).
donnees.sample(5)
Out[31]:
| species | island | culmen_length_mm | culmen_depth_mm | flipper_length_mm | body_mass_g | sex | espece | sexe | nom_ile | |
|---|---|---|---|---|---|---|---|---|---|---|
| 90 | 0 | 1 | 35.7 | 18.0 | 202.0 | 3550.0 | 0 | Adelie | female | Dream |
| 51 | 0 | 0 | 40.1 | 18.9 | 188.0 | 4300.0 | 1 | Adelie | male | Biscoe |
| 59 | 0 | 0 | 37.6 | 19.1 | 194.0 | 3750.0 | 1 | Adelie | male | Biscoe |
| 58 | 0 | 0 | 36.5 | 16.6 | 181.0 | 2850.0 | 0 | Adelie | female | Biscoe |
| 67 | 0 | 0 | 41.1 | 19.1 | 188.0 | 4100.0 | 1 | Adelie | male | Biscoe |
In [32]:
# Head-count per (species, sex), its share of the total, then a species x sex table.
# No fill_value is passed, so combinations with no rows show up as NaN.
affichage = (
    donnees.groupby(['espece', 'sexe'])['sex']
    .count()
    .reset_index(name='nombre')
)
affichage['%'] = affichage['nombre'] * 100 / affichage['nombre'].sum()
(
    affichage
    .pivot_table(index='espece', columns='sexe', values='nombre')
    .style.format("{:0.2f}")
    .background_gradient(cmap=plt.get_cmap('Blues'), axis=0)
)
Out[32]:
| sexe | female | male | none |
|---|---|---|---|
| espece | |||
| Adelie | 73.00 | 73.00 | 5.00 |
| Chinstrap | 34.00 | 34.00 | nan |
| Gentoo | 58.00 | 61.00 | 4.00 |
In [33]:
# Head-count per (species, island), its share of the total, then a species x island table.
# No fill_value is passed, so combinations with no rows show up as NaN.
affichage = (
    donnees.groupby(['espece', 'nom_ile'])['sex']
    .count()
    .reset_index(name='nombre')
)
affichage['%'] = affichage['nombre'] * 100 / affichage['nombre'].sum()
(
    affichage
    .pivot_table(index='espece', columns='nom_ile', values='nombre')
    .style.format("{:0.2f}")
    .background_gradient(cmap=plt.get_cmap('Blues'), axis=0)
)
Out[33]:
| nom_ile | Biscoe | Dream | Torgersen |
|---|---|---|---|
| espece | |||
| Adelie | 44.00 | 56.00 | 51.00 |
| Chinstrap | nan | 68.00 | nan |
| Gentoo | 123.00 | nan | nan |
In [34]:
# Head-count per (island, species, sex), its share of the total, then a
# (species, sex) x island table.
# No fill_value is passed, so combinations with no rows show up as NaN.
affichage = (
    donnees.groupby(['nom_ile', 'espece', 'sexe'])['sex']
    .count()
    .reset_index(name='nombre')
)
affichage['%'] = affichage['nombre'] * 100 / affichage['nombre'].sum()
(
    affichage
    .pivot_table(index=['espece', 'sexe'], columns='nom_ile', values='nombre')
    .style.format("{:0.2f}")
    .background_gradient(cmap=plt.get_cmap('Blues'), axis=0)
)
Out[34]:
| nom_ile | Biscoe | Dream | Torgersen | |
|---|---|---|---|---|
| espece | sexe | |||
| Adelie | female | 22.00 | 27.00 | 24.00 |
| male | 22.00 | 28.00 | 23.00 | |
| none | nan | 1.00 | 4.00 | |
| Chinstrap | female | nan | 34.00 | nan |
| male | nan | 34.00 | nan | |
| Gentoo | female | 58.00 | nan | nan |
| male | 61.00 | nan | nan | |
| none | 4.00 | nan | nan |
In [35]:
# Three equal-aspect panels: ax0 = species pie, ax1 and ax2 = nested rings.
# `radius` and `size` set the geometry of the rings drawn on ax1/ax2.
radius,size=0.8,0.3
fig,(ax0,ax1,ax2) = plt.subplots(ncols=3,figsize=(92,36), subplot_kw=dict(aspect="equal"))
# Head-count per species and its share of the total.
affichage = donnees.groupby(['espece']).sex.count().reset_index().rename(columns={'sex':'nombre'})
affichage['%'] = affichage.nombre * 100 / affichage.nombre.sum()
# NOTE(review): bare expression in the middle of a cell; it displays nothing and can be removed.
affichage
# Left panel: clockwise pie, one slightly exploded wedge per species, labelled with
# percentage and count through formatPct.
wedges, texts, autotexts = ax0.pie(
affichage['nombre'],
autopct=lambda pct: formatPct(pct, affichage.nombre.values), # autopct='%1.2f%%',
labels=affichage['espece'].values,
# shadow=True,
counterclock=False,
startangle=0 ,
colors = couleursEspece.values(),
# pctdistance=0.4,
labeldistance=1.1,
textprops=dict(color="#030aa7"),
explode=(0.03,0.03,0.03)
);
# Enlarge the percentage texts (white) and the species labels.
plt.setp(autotexts, size=48, weight="bold",color="w")
plt.setp(texts, size=64, weight="bold")
# Middle panel, outer ring: head-count per species (rows sorted by species).
affichage = donnees.groupby(['espece']).sex.count().reset_index().rename(columns={'sex':'nombre'})
affichage['%'] = affichage.nombre * 100 / affichage.nombre.sum()
affichage.sort_values(['espece'],inplace=True)
wedges, texts, autotexts = ax1.pie(
affichage['nombre'],
autopct=lambda pct: formatPct(pct, affichage.nombre.values), # autopct='%1.2f%%',
labels=affichage['espece'].values,
counterclock=False,
wedgeprops=dict(width=0.3),
pctdistance=0.8,
radius=radius,
colors=couleursEspece.values(),
# wedgeprops=dict(width=size, edgecolor='w'),
textprops=dict(color='#053061',fontsize='x-large'),
startangle=0 ,)
plt.setp(autotexts, size=48, weight="bold",color="w")
plt.setp(texts, size=64, weight="bold")
# Middle panel, inner ring: head-count per (species, sex), sorted the same way so that
# each sex wedge sits under its species wedge; coloured by sex.
affichage = donnees.groupby(['espece','sexe']).sex.count().reset_index().rename(columns={'sex':'nombre'})
affichage['%'] = affichage.nombre * 100 / affichage.nombre.sum()
affichage.sort_values(['espece','sexe'],inplace=True)
wedges, texts, autotexts = ax1.pie(
affichage['nombre'],
autopct=lambda pct: formatPct(pct, affichage.nombre.values), # autopct='%1.2f%%',
# labels=affichage['sexe'].values,
counterclock=False,
pctdistance=0.7,
radius=radius-size,
colors=affichage['sexe'].apply(lambda x : couleursSexe[x]),
wedgeprops=dict(width=size, edgecolor='w'),
textprops=dict(color="#053061"),
startangle=0 ,)
plt.setp(autotexts, size=32, weight="bold")
plt.setp(texts, size=32, weight="bold",color="w")
# NOTE(review): `wedges` holds one handle per (species, sex) row while the labels are the
# unique sexes; the legend presumably pairs them in order, so it is only correct while the
# first wedges cover every sex in the same order as the labels. Confirm, or build explicit handles.
ax1.legend(wedges, affichage['sexe'].unique(),
title="Sexe",
loc="upper left",
fontsize ='x-large',
title_fontsize='x-large',
bbox_to_anchor=(0, 1))
ax1.set_title("Sexe", size=96);
# Right panel, outer ring: head-count per species (rows sorted by species).
affichage = donnees.groupby(['espece']).nom_ile.count().reset_index().rename(columns={'nom_ile':'nombre'})
affichage['%'] = affichage.nombre * 100 / affichage.nombre.sum()
affichage.sort_values(['espece'],inplace=True)
# NOTE(review): `colorsSex` is not used anywhere in this cell; it repeats the island colours
# already stored in `couleursNomIle`, and its name suggests sex rather than island.
colorsSex = {'Biscoe':"#d1e5f0", 'Dream':"#fddbc7", 'Torgersen':"#dfc5fe"}
wedges, texts, autotexts = ax2.pie(
affichage['nombre'],
autopct=lambda pct: formatPct(pct, affichage.nombre.values), # autopct='%1.2f%%',
labels=affichage['espece'].values,
counterclock=False,
wedgeprops=dict(width=0.3),
pctdistance=0.8,
radius=radius,
colors=couleursEspece.values(),
# wedgeprops=dict(width=size, edgecolor='w'),
textprops=dict(color='#053061',fontsize='x-large'),
startangle=0 ,)
plt.setp(autotexts, size=48, weight="bold",color="w")
plt.setp(texts, size=64, weight="bold")
# Right panel, inner ring: head-count per (species, island), sorted the same way so that
# each island wedge sits under its species wedge; coloured by island.
affichage = donnees.groupby(['espece','nom_ile']).island.count().reset_index().rename(columns={'island':'nombre'})
affichage['%'] = affichage.nombre * 100 / affichage.nombre.sum()
affichage.sort_values(['espece','nom_ile'],inplace=True)
wedges, texts, autotexts = ax2.pie(
affichage['nombre'],
autopct=lambda pct: formatPct(pct, affichage.nombre.values), # autopct='%1.2f%%',
# labels=affichage['nom_ile'].values,
counterclock=False,
pctdistance=0.7,
radius=radius-size,
colors=affichage['nom_ile'].apply(lambda x : couleursNomIle[x]),
wedgeprops=dict(width=size, edgecolor='w'),
textprops=dict(color="#053061"),
startangle=0 ,)
plt.setp(autotexts, size=32, weight="bold")
plt.setp(texts, size=32, weight="bold",color="w")
# NOTE(review): `wedges` holds one handle per (species, island) row while the labels are the
# unique islands; the legend presumably pairs them in order, so it is only correct while the
# first wedges cover every island in the same order as the labels. Confirm, or build explicit handles.
ax2.legend(wedges, affichage['nom_ile'].unique(),
title="Nom de l’île",
loc="upper left",
fontsize ='x-large',
title_fontsize='x-large',
bbox_to_anchor=(0, 1))
ax2.set_title("Location géographique", size=96);
# Overall figure title, then let matplotlib tighten the panel layout.
fig.suptitle("Espèces de pingouins",fontsize=128);
plt.tight_layout()
# fig.set_facecolor("#ffffcb")
In [36]:
# Head-count per (island, species, sex) and its share of the total.
affichage = (
    donnees.groupby(['nom_ile', 'espece', 'sexe'])['sex']
    .count()
    .reset_index(name='nombre')
)
affichage['%'] = affichage['nombre'] * 100 / affichage['nombre'].sum()

# Treemap nested root -> island -> species -> sex; tile area and colour both encode the
# count, with the colour scale centred on the mean count.
fig = go.Figure(
    px.treemap(
        affichage,
        path=[px.Constant("Île Anvers"), 'nom_ile', 'espece', 'sexe'],
        values='nombre',
        color='nombre',
        hover_data=['nom_ile', 'espece', 'sexe'],
        color_continuous_scale='RdBu',
        color_continuous_midpoint=affichage['nombre'].mean(),
        width=1152,
        height=768,
    )
)
fig.show()
Distribution de l’échantillon des données ¶
In [37]:
# Peek at 5 random rows (no random_state is given, so the selection changes on every run).
donnees.sample(5)
Out[37]:
| species | island | culmen_length_mm | culmen_depth_mm | flipper_length_mm | body_mass_g | sex | espece | sexe | nom_ile | |
|---|---|---|---|---|---|---|---|---|---|---|
| 27 | 0 | 0 | 40.5 | 17.9 | 187.0 | 3200.0 | 0 | Adelie | female | Biscoe |
| 149 | 0 | 1 | 37.8 | 18.1 | 193.0 | 3750.0 | 1 | Adelie | male | Dream |
| 267 | 2 | 0 | 50.5 | 15.9 | 225.0 | 5400.0 | 1 | Gentoo | male | Biscoe |
| 289 | 2 | 0 | 50.7 | 15.0 | 223.0 | 5550.0 | 1 | Gentoo | male | Biscoe |
| 92 | 0 | 1 | 34.0 | 17.1 | 185.0 | 3400.0 | 0 | Adelie | female | Dream |
In [38]:
# Pairwise relationships between the measurements, coloured by species: KDE on the
# diagonal, scatter plots off-diagonal, density contours overlaid on the upper triangle.
graph = sns.pairplot(
    donnees.drop(columns=qualitatives+[cible]),
    hue='espece',
    # `size` is the deprecated name of this parameter; seaborn now calls it `height`.
    height=16,
    aspect=1,
    # Pass the name -> colour mapping itself so each species keeps its colour
    # whatever order the hue levels come in.
    palette=couleursEspece,
    plot_kws={"s": 1200,"alpha":0.6},
    markers=["o", "s", "^"],
    # corner=True,
    diag_kind="kde")
graph.map_upper(sns.kdeplot, levels=24, color=".2");
# Replace the automatically generated legend by one with larger fonts.
graph._legend.remove()
graph.add_legend(fontsize='xx-large', title_fontsize='xx-large');
sauvegarderImage('Distribution de l’échantillon des données')

